Wk 08: Sentiment Analysis

Overview

Here is some sample code to get sentiment data from bing and nrc dictionaries.

The data

Code
library(tidyverse)
load("data/tmm_comments.Rdata")

glimpse(data)
Rows: 38,661
Columns: 6
$ `Letter #`          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ `Organization Name` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `Letter Text`       <chr> "The proposal to renew expired mineral leases on t…
$ ...4                <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ...5                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ...6                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Sentiment analysis

Code
library(janitor)
data <- data %>%
  clean_names()

glimpse(data)
Rows: 38,661
Columns: 6
$ letter_number     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
$ organization_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ letter_text       <chr> "The proposal to renew expired mineral leases on the…
$ x4                <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ x5                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ x6                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Letter text

Code
head(data$letter_text, n = 1)
[1] "The proposal to renew expired mineral leases on the Superior National Forest near the Boundary Waters shows a disregard for science, economics, and public opinion. The U.S. Forest Service rejected these very leases in 2016 because allowing mining to proceed would be inconsistent with its obligation to manage and protect these lands and waters for future generations. The Trump Administration has reversed the U.S. Forest Services prior decisions and illegally put the project into the hands of the Interior Departments Bureau of Land Management. The U.S. Forest Service has an obligation to protect the Boundary Waters. The BLMs inadequate environmental review rejects the U.S. Forest Services authority and ignores the overwhelming science and economics that support long-term mining protections for this area. Refusing to renew the expired mineral leases is the way to ensure that the conservation and protection of the Boundary Waters continues. The BLM must at least prepare a full environmental impact statement that includes a no-lease-renewal alternative and thoroughly examines the significant impacts of mining should it decide to proceed with reckless and illegal lease renewal.Thank you."
Code
library(tidytext)

#install.packages("textdata")
library(textdata)

AFINN

Code
# get AFINN sentiment dictionary (you may have to download)
get_sentiments("afinn")
# A tibble: 2,477 × 2
   word       value
   <chr>      <dbl>
 1 abandon       -2
 2 abandoned     -2
 3 abandons      -2
 4 abducted      -2
 5 abduction     -2
 6 abductions    -2
 7 abhor         -3
 8 abhorred      -3
 9 abhorrent     -3
10 abhors        -3
# ℹ 2,467 more rows

Bing

Code
# get bing sentiment dictionary
get_sentiments("bing")
# A tibble: 6,786 × 2
   word        sentiment
   <chr>       <chr>    
 1 2-faces     negative 
 2 abnormal    negative 
 3 abolish     negative 
 4 abominable  negative 
 5 abominably  negative 
 6 abominate   negative 
 7 abomination negative 
 8 abort       negative 
 9 aborted     negative 
10 aborts      negative 
# ℹ 6,776 more rows

NRC

Code
# get nrc dictionary (you may have to download)
get_sentiments("nrc")
# A tibble: 13,872 × 2
   word        sentiment
   <chr>       <chr>    
 1 abacus      trust    
 2 abandon     fear     
 3 abandon     negative 
 4 abandon     sadness  
 5 abandoned   anger    
 6 abandoned   fear     
 7 abandoned   negative 
 8 abandoned   sadness  
 9 abandonment anger    
10 abandonment fear     
# ℹ 13,862 more rows

Setting up our data

Code
data <- data %>%
  select(letter_number, organization_name, letter_text)
Code
tokens <- data %>%
  unnest_tokens(word, letter_text)

head(tokens)
# A tibble: 6 × 3
  letter_number organization_name word    
          <dbl> <chr>             <chr>   
1             1 <NA>              the     
2             1 <NA>              proposal
3             1 <NA>              to      
4             1 <NA>              renew   
5             1 <NA>              expired 
6             1 <NA>              mineral 

Get bing sentiment scores

In the following code, we

Code
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"), by = c(word = "word")) %>%
  group_by(letter_number) %>%
  summarise(bing_sentiment = sum(sentiment == "positive") - sum(sentiment == "negative"))


bing_sentiments
# A tibble: 38,347 × 2
   letter_number bing_sentiment
           <dbl>          <int>
 1             1             -5
 2             2             -5
 3             3             -5
 4             4             -3
 5             5             -5
 6             6             -5
 7             7             -5
 8             8             -5
 9             9             -5
10            10             -5
# ℹ 38,337 more rows

Get NRC sentiment scores

Code
# Calculate sentiment scores using the NRC lexicon
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), by = c(word = "word")) %>%
  group_by(letter_number, sentiment) %>%
  summarise(sentiment_count = n()) %>%  # Count the number of each sentiment
  pivot_wider(names_from = sentiment, values_from = sentiment_count, values_fill = 0) %>%
  ungroup(.)

# rename columns to include prefix "nrc_"
nrc_sentiments <- nrc_sentiments %>%
  rename_with(~ paste0("nrc_", .), -letter_number)

head(nrc_sentiments)
# A tibble: 6 × 11
  letter_number nrc_anger nrc_anticipation nrc_disgust nrc_fear nrc_negative
          <dbl>     <int>            <int>       <int>    <int>        <int>
1             1         3                4           2        3            8
2             2         3                4           2        3            8
3             3         3                4           2        3            8
4             4         2                1           0        1            2
5             5         3                4           2        3            8
6             6         3                4           2        3            8
# ℹ 5 more variables: nrc_positive <int>, nrc_sadness <int>,
#   nrc_surprise <int>, nrc_trust <int>, nrc_joy <int>
Code
# Join the sentiment scores with the original DataFrame
results <- data %>%
  left_join(bing_sentiments, by = "letter_number") %>%
  left_join(nrc_sentiments, by = "letter_number")

# View the resulting DataFrame
head(results)
# A tibble: 6 × 14
  letter_number organization_name letter_text           bing_sentiment nrc_anger
          <dbl> <chr>             <chr>                          <int>     <int>
1             1 <NA>              The proposal to rene…             -5         3
2             2 <NA>              The proposal to rene…             -5         3
3             3 <NA>              The proposal to rene…             -5         3
4             4 <NA>              The renewal of these…             -3         2
5             5 <NA>              The proposal to rene…             -5         3
6             6 <NA>              The proposal to rene…             -5         3
# ℹ 9 more variables: nrc_anticipation <int>, nrc_disgust <int>,
#   nrc_fear <int>, nrc_negative <int>, nrc_positive <int>, nrc_sadness <int>,
#   nrc_surprise <int>, nrc_trust <int>, nrc_joy <int>

Write out results for later use

Code
save(results, file = "out/tmm_sentiment_results.Rdata")

Explore sentiment results

Code
# Define the sentiment category you want to use (replace with the desired category)
sentiment_category <- "nrc_trust"

# Extract the top 5 observations for the specified sentiment category
top_5_sentiment <- results %>%
  arrange(desc({{sentiment_category}})) %>%
  slice_head(n = 5) %>%
  select(letter_number, letter_text, {{sentiment_category}})

# Print the top 5 observations
print(top_5_sentiment)
# A tibble: 5 × 3
  letter_number letter_text                                            nrc_trust
          <dbl> <chr>                                                      <int>
1             1 The proposal to renew expired mineral leases on the S…         6
2             2 The proposal to renew expired mineral leases on the S…         6
3             3 The proposal to renew expired mineral leases on the S…         6
4             4 The renewal of these leases is arbitrary and done in …         1
5             5 The proposal to renew expired mineral leases on the S…         6

Filter out repeat letters; create subsets

Code
# Create a new df with unique 'letter_text' observations
results_filtered <- results %>%
  distinct(letter_text, .keep_all = TRUE)

# create subsets for exploration
results_anger <- results_filtered %>%
  select(letter_number, letter_text, nrc_anger)
Code
library(reactable)

# create table (define columns, add parameters & formatting)
reactable(results_anger, searchable = TRUE, filterable = TRUE)
Back to top